#!/home/chunwei/chunenv/bin/python
# -*- coding: utf-8 -*-
import sys
import pandas as pd
import numpy as np
import argparse
from scipy.optimize import minimize
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score
from sklearn import metrics
import os

# read trainset
# read trainset
def parse_args():
    """Parse the three positional CLI arguments into a dict.

    When the script is launched with no arguments, argparse's help is
    forced by injecting '-h' (argparse then prints usage and exits).
    """
    if len(sys.argv) == 1:
        sys.argv.append('-h')

    parser = argparse.ArgumentParser()
    for positional in ('train_paths', 'test', 'output'):
        parser.add_argument(positional)
    return vars(parser.parse_args())

args = parse_args()
# train_paths is a whitespace-separated list of CSV part files.
train_paths = args['train_paths'].split()
test = args['test']
output_path = args['output']

# Pull a stratified 20% validation slice out of every training part and
# stack the slices into one shared validation set used to score all models.
overall_test_x = None
overall_test_y = None
trains = []
for path in train_paths:
    part = pd.read_csv(path)
    labels = part['label']
    part.drop(['label', 'enrollment_id'], axis=1, inplace=True)
    print("Training set has {0[0]} rows and {0[1]} columns".format(part.shape))
    print('split train and test ...')
    # Legacy sklearn.cross_validation API: the splitter is constructed with
    # the labels and iterated directly; we only need its first split.
    splitter = StratifiedShuffleSplit(labels, test_size=0.2, random_state=1234)
    train_index, test_index = next(iter(splitter))

    train_x, train_y = part.values[train_index], labels.values[train_index]
    test_x, test_y = part.values[test_index], labels.values[test_index]

    if overall_test_x is None:
        overall_test_x, overall_test_y = test_x, test_y
    else:
        overall_test_x = np.append(overall_test_x, test_x, axis=0)
        overall_test_y = np.append(overall_test_y, test_y, axis=0)

    trains.append([train_x, train_y])


# read testset: keep the enrollment ids for the output file, feed the
# remaining columns to the models as a plain ndarray.
test = pd.read_csv(test)
test_ids = test['enrollment_id']
test = test.drop(['enrollment_id'], axis=1).values
print("Testing set has {0[0]} rows and {0[1]} columns".format(test.shape))


def random_forest(train_x, train_y, test_x, n_estimators=200, random_state=2323):
    """Fit a random forest and return (model, validation probabilities,
    test-set positive-class probabilities).

    predict is the full (n_validation, 2) predict_proba matrix for the
    held-out rows; output is a 1-D vector of positive-class probabilities
    on the global test set, matching the shape gbdt() returns.

    Bug fix: the original took ``predict_proba(test)[0]`` — the two class
    probabilities of the FIRST test row (length n_classes) — instead of the
    positive-class column for every row (length n_samples), which broke the
    weighted blend and the ``len(output) == len(test_ids)`` assertion below.
    """
    print('... training random forest')
    rfc = RandomForestClassifier(n_estimators=n_estimators,
                                 random_state=random_state, n_jobs=-1)
    rfc.fit(train_x, train_y)
    predict = rfc.predict_proba(test_x)
    # column 1 = probability of the positive class, one value per test row
    output = rfc.predict_proba(test)[:, 1]
    return rfc, predict, output

def gbdt(train_x, train_y, test_x, max_depth=6, n_estimators=50):
    print '... training GBDT, max_depth:\t', max_depth, 'n_estimators:\t', n_estimators
    gbdt = GradientBoostingRegressor(
        max_depth = max_depth,
        n_estimators = n_estimators
        )
    gbdt.fit(train_x, train_y)
    predict = gbdt.predict(test_x)
    # 只能输出单个prob
    # 需要自己填充一行zero
    len_predicts = predict.shape[0]
    predict = predict.reshape((1, len_predicts))
    predict[predict < 0.0] = 0.0
    predict[predict > 1.0] = 1.0
    print predict
    array = (1-predict).reshape((1, len_predicts))
    print array
    predict = np.concatenate((array, predict), axis=0).T
    print 'predict:\t', predict.shape, predict
    output = gbdt.predict(test)
    output[output < 0.0] = 0.0
    output[output > 1.0] = 1.0
    print 'feature importance:\t', gbdt.feature_importances_  
    return gbdt, predict, output

clfs = []
predictions = []
final_predictions = []

for train_x, train_y in trains:
    for model, predict, test_output in [
            #random_forest(train_x, train_y, overall_test_x, 50, 2323),
            random_forest(train_x, train_y, overall_test_x, 200, 3323),
            #random_forest(train_x, train_y, overall_test_x, 400, 12323),
            gbdt(train_x, train_y, overall_test_x, 4, 50)
        ]:
        clfs.append(model)
        predictions.append(predict)
        final_predictions.append(test_output)
        score = roc_auc_score(y_true = overall_test_y, y_score = predict[:,1])
        print 'score:\t', score
'''
for clf in clfs:
    predictions.append(clf.predict_proba(test_x))
'''

def log_loss_func(weights):
    """Blend the stored validation predictions with *weights* (a numpy
    array supplied by scipy's minimize) and score the mix with log-loss."""
    blended = sum(w * p for w, p in zip(weights, predictions))
    return log_loss(overall_test_y, blended)
    
starting_values = [0.5]*len(predictions)
cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
bounds = [(0,1)]*len(predictions)
res = minimize(log_loss_func, starting_values, method='SLSQP', bounds=bounds, constraints=cons)
print('Ensamble Score: {best_score}'.format(best_score=res['fun']))
print('Best Weights: {weights}'.format(weights=res['x']))

weights = res['x']
'''
final_predictions = []
for clf in clfs:
    final_predictions.append(clf.predict_proba(test))
'''
output = final_predictions[0] * res['x'][0]
for i, weight in enumerate(res['x']):
    if i > 0:
        output += final_predictions[i] * res['x'][i]

assert(len(output) == len(test_ids)),"%d:%d" % ( len(output), len(test_ids))

print 'output result ...', output
with open(output_path, 'w') as f:
    for i, p in enumerate(output):
        f.write( "%d,%f\n" % (test_ids[i], p))
